Final Project Checkpoint #3

Author
Affiliation

David Sneddon

Old Dominion University

Data

There is a wealth of data available from the Canadian government https://www.statcan.gc.ca/en/start. This includes interprovincial trade amongst the Canadian provinces and even international trade between Canadian provinces and US States in addition to provinicial GDP data. Much of the US data is available from the US Census bureau, including state GDP GDP and population.

Geographic data for Canada was more difficult to come by, as I diverged from the original work done by McCallum by using population centroids as opposed to distance to principal cities. The US population centers were available from the Census bureau, however the Canadian counterparts were not. I ended up using ChatGPT to calculate these centroids (noted in the code). Summary statistics for the data to be used in the model are listed below both with and without a logarithmic transformation.

Code
#LIBRARIES
library(readr)
library(geosphere)
library(utf8)
library(statcanR)
library(bea.R)
library(censusapi)
library("ggplot2")
library("sf")
library("rnaturalearth")
library("rnaturalearthdata")
library("canadianmaps")
library(plyr)
library(stringr)
library(usmap)
library(spData) 
library("tools")
library("modelsummary")
library(AER)
options("modelsummary_factory_default" = "kableExtra")
options(scipen = 999)

#FUNCTION FOR CREATING MAPS
gravitymap <- function (x){
  theme_set(theme_bw())
  sf_use_s2(FALSE)
  
  worldz <- ne_countries(scale = "large", returnclass = "sf")
  class(worldz)
  (sites <- data.frame(longitude = c(-172.54, -47.74), latitude = c(23.81,
                                                                    90)))
  
  
  (sites <- st_as_sf(geodata, coords = c("lon", "lat"),
                     crs=4369, agr = "constant"))
  
  sites <- st_transform(sites, st_crs("ESRI:102010"))
  states <- st_transform(us_states, st_crs("ESRI:102010"))
  head(states)
  provs <- st_sf(PROV[,c(5, 10, 11, 12)])
  ak <- st_transform(alaska, st_crs("ESRI:102010"))
  hi <- st_transform(hawaii, st_crs("ESRI:102010"))
  akhi <- rbind(ak,hi)
  akhi <- cbind(akhi, st_coordinates(st_centroid(akhi)))
  akhi <- st_sf(akhi[c(2,7,8,9)])
  provs <- st_transform(provs, st_crs("ESRI:102010"))
  
  
  states <- cbind(states, st_coordinates(st_centroid(states)))
  states <- states[c(2, 7, 8, 9)]
  colnames(provs) <- colnames(akhi) <- colnames(states)
  
  head(states)

  
  head(states)
  stpr <- st_sf(rbind(as.data.frame(states),
                      as.data.frame(provs),
                      as.data.frame(akhi)))
  
  tf <- data.frame(us_ca_split[x])
  prov_abr2 <- prov_abr
  colnames(prov_abr2) <- colnames(st_abbr)
  tabrs <- rbind(st_abbr, prov_abr2)
  colnames(tabrs)[1] <- "NAME"
  colnames(tf) <- colnames(us_ca)
  tf$value <- log(tf$value)
  tf <- tf[c(2, 7)]
  colnames(tf)[1] <- "State"
  tstpr <- merge(stpr, tabrs, by = "NAME")
  tstpr <- merge(tstpr, tf, by = "State")
  
  ggplot(data = worldz) +
    geom_sf() +
    ggtitle(paste("Import Gravity Map for", prov_abr[prov_abr$from==x,])) +
    geom_sf(data = tstpr, 
            aes(fill =value)) +
    scale_fill_gradient(low="red", high= "green") +
    coord_sf(xlim = c(-6500000, 3800000), 
             ylim = c(4700000, -1700000), 
             expand = FALSE, 
             crs = st_crs("ESRI:102010"))

}


#DATA IMPORTS
url_df <- read.csv("url_df.csv")
usfips <- read.csv("https://www2.census.gov/geo/docs/reference/state.txt",
                   sep = "|")
usfips <- usfips[c(1, 2)]
colnames(usfips) <- c("STATE", "state")

##GEOGRAPHIC DATA
###Centers of Population
st_lat_long <- read_csv("st_pop_center.csv") #From Census.gov - 2020
canprov <- read_csv("canprovcenter.csv") #Calculated with ChatGPT - 2021
st_abbr <- read_csv("st_abbr.csv")



#DISTANCE DATA
geodata <- as.data.frame(rbind(as.matrix(st_lat_long),
                               as.matrix(canprov)))
geoframe <- data.frame(matrix(0, nrow(geodata)^2, 3))
colnames(geoframe) <- c("from", "to", "km")
c <- 1
for (i in geodata[,1]){
  for (w in geodata[,1]){
    
    lat_a <- as.numeric(geodata[geodata$state==i,2])
    lat_b <- as.numeric(geodata[geodata$state==w,2])
    lon_a <- as.numeric(geodata[geodata$state==i,3])
    lon_b <- as.numeric(geodata[geodata$state==w,3])
    geoframe[c,1] <- i
    geoframe[c,2] <- w
    geoframe[c,3] <- as.numeric(distHaversine(c(lon_a, lat_a),
                                              c(lon_b, lat_b)))/1000
    c <- c+1
  }
}
geoframe["fttag"] <- paste0(geoframe$from,geoframe$to)
geoframe <- geoframe[,c(3, 4)]




##CANADIAN DATA
can_exports <- read.csv(url_df$candata)
can_exports <- can_exports[,c(2, 4, 5, 12)]
#DATAFRAME FOR US STATE AND PROVINCIAL ABBREVIATIONS FOR UNIFORMITY
#NEEDED FOR MERGING
prov_abr <- data.frame(GEO = unique(can_exports$GEO))
prov_abr["from"] <- c("NL", "PE", "NS", "NB", "QC", "ON", "MB", "SK", "AB", 
                      "BC", "YT", "NT", "NU")
provto_abr <- prov_abr
provto_abr$GEO <- paste("To", prov_abr$GEO)
colnames(provto_abr) <- c("To_GEO","to")
colnames(can_exports)[2] <- "To_GEO"
#FORMAT can_exports  FOR MERGING
can_exports <- merge(can_exports, prov_abr, by="GEO")
can_exports <- merge(can_exports, provto_abr, by="To_GEO")
rm(provto_abr)
can_exports["value"] <- can_exports$VALUE*1000
can_exports <- can_exports[,-(1:4)]
can_exports["fttag"] <- paste0(can_exports$from,can_exports$to)
can_exports["intra"] <- -1
for (i in 1:nrow(can_exports)){
  if(can_exports$from[i]==can_exports$to[i]){
    can_exports$intra[i] <- 1
  }else
    can_exports$intra[i] <- 0
}
can_exports <- can_exports[can_exports$intra == 0,]
can_exports$intra <- NULL
tmp <- can_exports
can_exports <- tmp[,c(2,1,3,4)]
rm(tmp)
can_exports["domestic"] <- 1


###IMPORT AGGREGATE AND CONDENSE CAN_IMPORTS DATAFRAME
can_imports <- read.csv("ODPFN022_201912N.csv")
can_imports <- can_imports[can_imports$Country.Pays=="US",]
colnames(can_imports) <- c("YearMonth", 
                           "HS2", 
                           "Country", 
                           "Province", 
                           "State", 
                           "Value")
can_imports <- aggregate(can_imports$Value
                         ~ can_imports$Country
                         + can_imports$Province
                         + can_imports$State,
                         FUN = sum)
for (i in 1:nrow(can_imports)){
  for (u in 1:3){
    can_imports[i,u] <- as_utf8(can_imports[i,u])
  }
}
can_imports <- can_imports[,c(2, 3, 4)]
colnames(can_imports) <- c("to", "from", "value")
can_imports["fttag"] <- paste0(can_imports$from,can_imports$to)
can_imports["domestic"] <- 0

###CANADIAN GDP
can_gdp <- read.csv(url_df$can_gdp)
can_gdp <- can_gdp[,c("GEO","VALUE")]
can_gdp <- merge(can_gdp, prov_abr, by="GEO")
cagdp_f <- data.frame(from=can_gdp$from,
                      from_gdp=can_gdp$VALUE)
cagdp_t <- data.frame(to=can_gdp$from,
                      to_gdp=can_gdp$VALUE)
##MERGE CANADIAN IMPORTS AND EXPORTS

##US DATA
bkey <- "EB73777D-FF2C-43F1-A94A-E1BFFBA2744D"
userSpecList <- list('UserID' = bkey,
                     'Method' = 'GetData',
                     'datasetname' = 'Regional',
                     'GeoFips' = 'STATE',
                     'LineCode' = '3',
                     'TableName' = 'SAGDP1',
                     'Year' = '2019')
us_gdp <- beaGet(userSpecList)
us_gdp <- us_gdp[-1,]
us_gdp <- us_gdp[-(52:59),]
us_gdp <- us_gdp[,c(3, 6)]
exch_usca <- 1.3269 #USD TO CAD 2019 https://www.bankofcanada.ca/rates/exchange/annual-average-exchange-rates/
us_gdp <- merge(us_gdp, st_abbr, by="GeoName")
usgdp_f <- data.frame(from=us_gdp$State,
                      from_gdp=as.numeric(us_gdp$`DataValue_2019`)*exch_usca)
usgdp_t <- data.frame(to=us_gdp$State,
                      to_gdp=as.numeric(us_gdp$`DataValue_2019`)*exch_usca)
rm(us_gdp, exch_usca)
#MERGE BOTH COUNTRIES GDPs
gdp_f <- rbind(cagdp_f,usgdp_f)
rm(cagdp_f, usgdp_f)
gdp_t <- rbind(cagdp_t,usgdp_t)
rm(cagdp_t, usgdp_t)
##POPULATION
###US
st_pop <- getCensus(
  name = "dec/dhc",
  vars = "P1_001N",
  region = "state:*",
  vintage = 2020
)
st_pop$state <- as.numeric(st_pop$state)
colnames(st_pop) <- c("STATE", "pop")
st_pop <- merge(st_pop, usfips, by="STATE")
st_pop <- st_pop[,c(3, 2)]
st_pop_from <- data.frame(st_pop$state, st_pop$pop)
st_pop_to <- data.frame(st_pop$state, st_pop$pop)
colnames(st_pop_from) <- c("from","from_pop")
colnames(st_pop_to) <- c("to","to_pop")

###CANADA

prov_pop <- read.csv("prov_pop.csv")
prov_pop <- prov_pop[prov_pop$CHARACTERISTIC_ID==1 
                     & prov_pop$ALT_GEO_CODE !=  1, c(5, 12)]
colnames(prov_pop) <- c("GEO", "pop")
prov_pop <- merge(prov_pop, prov_abr, by="GEO")
prov_pop$to <- prov_pop$from
prov_pop_from <- data.frame(prov_pop$from, prov_pop$pop)
prov_pop_to <- data.frame(prov_pop$to, prov_pop$pop)
colnames(prov_pop_from) <- c("from","from_pop")
colnames(prov_pop_to) <- c("to","to_pop")

pop_from <- rbind(st_pop_from,prov_pop_from)
pop_to <- rbind(st_pop_to,prov_pop_to)
#PULL TOGETHER can_exports, can_imports, geoframe, populations, and GDPs
us_ca <- rbind(can_imports, can_exports)
us_ca <- merge(us_ca, geoframe, by = "fttag")
us_ca <- merge(us_ca, gdp_f, by = "from")
us_ca <- merge(us_ca, gdp_t, by = "to")
us_ca <- us_ca[,c(2,1,7,8,5,6,4)]
us_ca <- us_ca <- merge(us_ca, pop_from, by = "from")
us_ca <- us_ca <- merge(us_ca, pop_to, by = "to")
us_ca <- us_ca[us_ca$value != 0,]
us_ca <- us_ca[us_ca$to != "NU" & us_ca$to != "NT" & us_ca$to != "YT" &
                 us_ca$from != "NU" & us_ca$from != "NT" & us_ca$from != "YT",]
#SPLIT TO EACH PROVINCE
us_ca_split <- split(us_ca, list(us_ca$to))
#REGRESSION

coefnames <- c("Intercept","From GDP", "To GDP", "Distance (km)", "Intl. Dummy" )

reg1 <- lm(log(value[domestic==1])
           ~ log(from_gdp[domestic==1])
           + log(to_gdp[domestic==1])
           + log(km[domestic==1])
           + domestic[domestic==1],
           data = us_ca)
names(reg1$coefficients) <- coefnames

reg2 <- lm(log(value)
           ~ log(from_gdp)
           + log(to_gdp) 
           + log(km) 
           + domestic, 
           data=us_ca)

names(reg2$coefficients) <- coefnames

reg3 <- lm(log(value[(from_gdp > 10^4) & (to_gdp > 10^4)])
           ~ log(from_gdp[(from_gdp > 10^4) & (to_gdp > 10^4)])
           + log(to_gdp[(from_gdp > 10^4) & (to_gdp > 10^4)]) 
           + log(km[(from_gdp > 10^4) & (to_gdp > 10^4)]) 
           + domestic[(from_gdp > 10^4) & (to_gdp > 10^4)], 
           data=us_ca)

names(reg3$coefficients) <- coefnames

reg4 <- lm(log(value)
           ~ log(from_gdp)
           + log(to_gdp) 
           + log(km) 
           + domestic, 
           data=us_ca,
           weights = from_gdp + to_gdp)

names(reg4$coefficients) <- coefnames


reg5 <- lm(log(value)
           ~ log(from_pop)
           + log(to_pop) 
           + log(km) 
           + domestic, 
           data=us_ca)

names(reg5$coefficients) <- coefnames

regz <- list(reg1, reg2, reg3, reg4, reg5)

plot ((us_ca$km), log(us_ca$value),
      pch = 20,
      cex = 1,
      main = "Distance vs Total Goods Imports",
      xlab = "Distance (km)",
      ylab = "log(Goods Imports)",
      col = alpha(us_ca$domestic+2,0.5),
      las = 1
      )
legend("topright",
       pch = 20,
       bty = "n",
       cex = 0.75,
       horiz = FALSE,
       legend = c("International", "Interprovincial"),
       col = c(2, 3))


title <- "US Canada Trade Data"
frmla <- (`Imports` = value) +
  (`Distance (km)` = km) + 
  (`Export GDP` = from_gdp) +
  (`Import GDP` = to_gdp) ~ 
  (`N` = length) + 
  Mean + 
  (`St. Dev.` = sd) + 
  (`Min` = min) + 
  (`Max` = max)
frmlalog <- (`Imports` = log(value)) +
  (`Distance (km)` = log(km)) + 
  (`Export GDP` = log(from_gdp)) +
  (`Import GDP` = log(to_gdp)) ~ 
  (`N` = length) + 
  Mean + 
  (`St. Dev.` = sd) + 
  (`Min` = min) + 
  (`Max` = max)
#Output a table
datasummary(frmla, 
            data = us_ca, 
            title = paste(title, "- Summary Statistics"),
            fmt = fmt_significant(2))
datasummary(frmlalog, 
            data = us_ca, 
            title = paste(title, "- Summary Statistics - log"), 
            fmt = fmt_significant(2))
modelsummary(regz, 
             data = us_ca, 
             title = paste(title, "- Regression"),
             fmt = fmt_significant(2))
US Canada Trade Data - Summary Statistics
N Mean St. Dev. Min Max
Imports 580 819851002 2556957722 67 27905413372
Distance (km) 580 2298 1296 134 8678
Export GDP 580 519082 682862 6778 4071765
Import GDP 580 221674 252042 6778 833629
US Canada Trade Data - Summary Statistics - log
N Mean St. Dev. Min Max
Imports 580 17 3.8 4.2 24
Distance (km) 580 7.6 0.67 4.9 9.1
Export GDP 580 13 1.2 8.8 15
Import GDP 580 12 1.4 8.8 14
US Canada Trade Data - Regression
 (1)   (2)   (3)   (4)   (5)
Intercept 6.48 −5.2 −3.8 −9.1 −17
(0.86) (1.8) (2.1) (1.7) (2)
From GDP 0.984 1.196 1.186 1.231 1.322
(0.046) (0.086) (0.095) (0.073) (0.086)
To GDP 0.935 1.694 1.631 1.698 1.799
(0.046) (0.069) (0.088) (0.061) (0.071)
Distance (km) −1.168 −1.68 −1.74 −1.24 −1.63
(0.074) (0.14) (0.16) (0.13) (0.14)
Intl. Dummy 4.34 4.1 3.8 4.00
(0.28) (0.3) (0.3) (0.26)
Num.Obs. 90 580 521 580 580
R2 0.908 0.656 0.604 0.678 0.673
R2 Adj. 0.905 0.654 0.600 0.675 0.671
AIC 22614.7 24150.6 22585.8
BIC 22640.9 24176.8 22612.0
Log.Lik. −80.090 −1291.561 −1165.818 −1319.909 −1277.117
RMSE 0.59 2.24 2.27 2.28 2.19
Plot

Empirical Model

\[log(\widehat{Im}_p)=\beta_0 + \beta_1log(Y_{Ex}) + \beta_2log(Y_{Im}) + \beta_3log(d) + \beta_4B + \varepsilon \] Where \(log(\widehat{Im}_p)\) is the estimated imports, \(Y_{Ex}\) is the GDP of the exporting province or state, \(Y_{Im}\) is the GDP of the importing province, \(d\) is the distance in kilometers from each province or states population center, and \(B\) is a dummy variable for whether the trade crosses an international border.

Note that the 5th regression in the summary statistics uses population in lieu of GDP.

I used \(log-log\) regression as there are extreme variances in the data where states with exceptionally high GDP’s such as Texas, while being one of the most distant states from the Canadian border still has a significant amount of exports. Using logarithms stabilizes this effect.

There are five variations of the regresson:
1. Canada Only
2. Canada and USA
3. Canada and USA with GDP greater that $10 billion
4. Canada and USA weighted by GDP
5. Canada and USA with population in lieu of GDP

Maps

Alberta British Columbia Manitoba

New Brunswick Newfoundland Nova Scotia

Ontario Prince Edward Island Quebec Saskatchewan